package com.charm.utils;

import java.util.Map;
import java.util.List;
import java.util.Vector;
import java.util.HashMap;
import java.util.Comparator;
import java.util.Collections;

import java.io.FileReader;
import java.io.BufferedReader;

import com.charm.utils.CmAnsj.CmWord;

/**
 * Computes a tf-idf score for every word of a segmented article and returns
 * the scores sorted in descending order.
 *
 * @author gonglibin
 * 2017.08.22
 */

public class CmTfIdf {
	/* Column indexes inside one tab-separated line of the idf data file. */
	private static final int CM_IKEY = 0X00;
	private static final int CM_IVAL = 0X01;
	
	/* word -> idf weight; stays empty when no data file has been loaded. */
	private final Map<String, Double> idf = new HashMap<>();
	
	/**
	 * Relative path of an idf data file. It is not referenced inside this
	 * class; presumably callers pass it to {@link #CmTfIdf(String)} (verify
	 * against callers).
	 */
	public static final String CM_FIDF = "./config/CmTfIdfConfig.dat";
	
	/**
	 * Creates an instance with an empty idf table. With no idf weights
	 * loaded, {@link #CmTfIdfTurn(List, Integer)} returns an empty vector.
	 */
	public CmTfIdf() {}
	
	/**
	 * Creates an instance and loads the idf table from a file in which every
	 * line has the form {@code word<TAB>idf}.
	 * <p>
	 * Loading is best-effort: lines without a value column are skipped, lines
	 * whose value is not a number are reported on stderr and skipped, and an
	 * I/O failure prints the stack trace and leaves whatever was read so far
	 * in the table. The constructor itself never throws.
	 *
	 * @param f path of the idf data file
	 */
	public CmTfIdf(String f) {
		/* NOTE(review): FileReader decodes with the platform default charset;
		 * confirm it matches the encoding the data file is written in. */
		try (BufferedReader br = new BufferedReader(new FileReader(f))) {
			String line;
			
			while (null != (line = br.readLine())) {
				String[] arr = line.split("\t");
				
				/* Blank line or missing value column: skip it instead of
				 * aborting the load of every remaining line. */
				if (arr.length <= CM_IVAL) {
					continue;
				}
				
				try {
					idf.put(arr[CM_IKEY], Double.valueOf(arr[CM_IVAL]));
				} catch (NumberFormatException e) {
					System.err.println("CmTfIdf: skipping line with non-numeric idf value: " + line);
				}
			}
		} catch (Exception e) {
			/* Kept broad on purpose: a null or unreadable path must not make
			 * construction fail, matching the original best-effort contract. */
			e.printStackTrace();
		}
	}
	
	/**
	 * Computes the tf-idf value of every distinct word in the list.
	 * <p>
	 * The term frequency of a word is its number of occurrences in {@code l}
	 * divided by the total word count; the result is multiplied by the
	 * word's idf weight. Words that have no idf weight are left out.
	 *
	 * @param l segmented words of the document; {@code null} is treated as empty
	 * @param n total word count used as the tf denominator; when {@code null}
	 *          or not positive, {@code l.size()} is used instead
	 * @return tf-idf entries sorted by value in descending order (possibly empty)
	 */
	public Vector<Map.Entry<String, Double>> CmTfIdfTurn(List<CmWord> l, Integer n) {
		Vector<Map.Entry<String, Double>> rst = new Vector<>();
		
		if (null == l || l.isEmpty()) {
			return rst;
		}
		
		/* Guard against null (unboxing NPE) and non-positive counts, which
		 * would otherwise yield negative or infinite scores. */
		int num = (null != n && n > 0) ? n : l.size();
		Map<String, Double> doc = new HashMap<>();
		
		/* Count how often each word occurs in the document. */
		for (CmWord w : l) {
			Double cnt = doc.get(w.val);
			doc.put(w.val, null != cnt ? cnt + 1 : 1.0);
		}
		
		/* tf-idf = tf * idf; the count stored in the entry is overwritten
		 * with the final score. */
		for (Map.Entry<String, Double> v : doc.entrySet()) {
			Double weight = idf.get(v.getKey());
			
			if (null != weight) {
				v.setValue(v.getValue() / num * weight);
				rst.add(v);
			}
		}
		
		/* Sort by tf-idf value, highest first. */
		Collections.sort(rst, new Comparator<Map.Entry<String, Double>>() {
			@Override
			public int compare(Map.Entry<String, Double> a, Map.Entry<String, Double> b) {
				return b.getValue().compareTo(a.getValue());
			}
		});
		
		return rst;
	}
	
	/**
	 * Prints the leading tf-idf entries on a single line of standard output,
	 * each formatted as {@code word: [value], }, followed by a newline.
	 *
	 * @param v result vector, as returned by {@link #CmTfIdfTurn(List, Integer)}
	 * @param n maximum number of entries to print
	 */
	public void CmTfIdfPrint(Vector<Map.Entry<String, Double>> v, int n) {
		for (int idx = 0; idx < v.size() && idx < n; ++ idx) {
			Map.Entry<String, Double> e = v.get(idx);
			System.out.print(e.getKey() + ": [" + e.getValue() + "], ");
		}
		System.out.println();
	}
}